Loading and Summarizing Metadata Info

To start off, we need to access the metadata that we have. It’s currently in the form of an rds file (created by the R script ‘preparing_2020_T2D_metadata.R’).

# Load the prepared metadata table from the RDS file in the working directory.
metadata_df <- readRDS('final_metadata.rds')
# Summary statistics for every column, pooled over all cohorts.
# No grouping is applied here, so there is no per-cohort breakdown.
skim(metadata_df)
Data summary
Name metadata_df
Number of rows 664
Number of columns 15
_______________________
Column type frequency:
character 12
numeric 3
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
dataset 0 1.00 3 8 0 3 0
PatientID 43 0.94 7 19 0 621 0
sampleID 26 0.96 2 9 0 533 0
disease 69 0.90 1 1 0 2 0
age 69 0.90 2 18 0 301 0
Gender 45 0.93 4 6 0 2 0
country 50 0.92 3 3 0 12 0
Ethnicity 51 0.92 1 7 0 16 0
BMI 69 0.90 2 18 0 423 0
HDL 58 0.91 1 19 0 289 0
LDL 56 0.92 1 19 0 408 0
TGL 53 0.92 1 19 0 372 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
CHOL 85 0.87 9.49 3.02 2.21 6.87 10.02 11.62 19.55 ▃▃▇▂▁
CR 418 0.37 41.49 36.15 0.55 0.94 58.00 69.00 182.00 ▇▇▂▁▁
HSCRP 455 0.31 3.89 8.32 0.00 0.93 1.64 3.53 99.22 ▇▁▁▁▁
# Summarise only the `age` column (reported below as a character column).
skim(metadata_df, age)
Data summary
Name metadata_df
Number of rows 664
Number of columns 15
_______________________
Column type frequency:
character 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
age 69 0.9 2 18 0 301 0

Visualizing the Metadata

Location of Samples

Location of Samples

Location of Samples

Age

# Histogram of participant age, stacked by cohort (`dataset`).
#
# `age` is stored as character in the metadata, so it is converted to numeric
# first; rows without a usable age are then dropped with an explicit is.na()
# test instead of a comparison against the string "NA".
tibbleData <- metadata_df %>%
  as_tibble() %>%
  mutate(age = as.numeric(age)) %>%
  filter(!is.na(age))

age_histogram <- ggplot(tibbleData, aes(x = age, fill = dataset)) +
  geom_histogram(binwidth = 2) +
  theme_light() +
  scale_x_continuous(
    name = "Age (years)",
    breaks = scales::pretty_breaks(n = 5)
  )
print(age_histogram)

# Save the plot object explicitly rather than relying on last_plot().
ggsave("Metadata_Figures/age_histogram.png", plot = age_histogram)
## Saving 7 x 5 in image
# Box plot of participant age, one box per cohort (`dataset`).
#
# `age` is stored as character, so convert it to numeric and keep only rows
# with a usable value (explicit is.na() rather than comparing with "NA").
tibbleData <- metadata_df %>%
  as_tibble() %>%
  mutate(age = as.numeric(age)) %>%
  filter(!is.na(age))

age_boxplot <- ggplot(tibbleData, aes(x = dataset, y = age)) +
  geom_boxplot(aes(color = dataset)) +
  theme_light()
print(age_boxplot)

# Save the plot object explicitly rather than relying on last_plot().
ggsave("Metadata_Figures/age_boxplot.png", plot = age_boxplot)
## Saving 7 x 5 in image

Gender

# Bar chart of the number of samples per cohort, split by gender.
#
# Rows with a missing Gender are removed with an explicit is.na() test; the
# previous `Gender != "NA"` only worked because filter() drops rows whose
# condition evaluates to NA.
tibbleData <- metadata_df %>%
  as_tibble() %>%
  filter(!is.na(Gender))

gender_barplot <- ggplot(tibbleData, aes(x = dataset, fill = Gender)) +
  geom_bar(position = position_dodge()) +
  theme_light()
print(gender_barplot)

# Save the plot object explicitly rather than relying on last_plot().
ggsave("Metadata_Figures/Gender_barplot.png", plot = gender_barplot)
## Saving 7 x 5 in image

Disease Condition

# Bar chart of the number of samples per cohort, split by disease status.
#
# Rows with a missing disease status are removed with an explicit is.na() test
# instead of a comparison against the string "NA".
tibbleData <- metadata_df %>%
  as_tibble() %>%
  filter(!is.na(disease))

# NOTE(review): the legend labels are matched to the disease codes by position
# (sorted order of the values), so this assumes the control code sorts before
# the T2D code -- confirm against the metadata preparation script.
disease_barplot <- ggplot(tibbleData, aes(x = dataset, fill = disease)) +
  geom_bar(position = position_dodge()) +
  scale_fill_brewer(
    palette = "Blues",
    labels = c("Control", "Type 2 Diabetes (T2D)")
  ) +
  theme_light()
print(disease_barplot)

# Save the plot object explicitly rather than relying on last_plot().
ggsave("Metadata_Figures/disease_barplot.png", plot = disease_barplot)
## Saving 7 x 5 in image

Body-Mass Index (BMI)

# Histogram of body-mass index, stacked by cohort (`dataset`).
#
# `BMI` is stored as character, so convert it to numeric and keep only rows
# with a usable value (explicit is.na() rather than comparing with "NA").
tibbleData <- metadata_df %>%
  as_tibble() %>%
  mutate(BMI = as.numeric(BMI)) %>%
  filter(!is.na(BMI))

# bins = 30 is the value ggplot2 was already falling back to; stating it
# keeps the figure unchanged and silences the `stat_bin()` message.
bmi_histogram <- ggplot(tibbleData, aes(x = BMI, fill = dataset, color = dataset)) +
  geom_histogram(bins = 30) +
  theme_light() +
  xlab('BMI (kg/m2)')
print(bmi_histogram)

# Save the plot object explicitly rather than relying on last_plot().
ggsave("Metadata_Figures/BMI_histogram.png", plot = bmi_histogram)
## Saving 7 x 5 in image

Body-Mass Index (BMI) of just patients with T2D

# Box plot of BMI per cohort, restricted to patients with T2D.
#
# Two fixes relative to the previous version:
#   * missing values are filtered on BMI (the plotted variable), not on age;
#   * the T2D group is selected. The old `disease == 0` kept the group that
#     the Disease Condition legend labels "Control" (the first code in sorted
#     order), which contradicts this section's heading.
tibbleData <- metadata_df %>%
  as_tibble() %>%
  mutate(BMI = as.numeric(BMI)) %>%
  # `disease` has exactly two codes; everything that is not the control code
  # "0" is T2D. Rows with a missing disease status are dropped by filter().
  filter(!is.na(BMI), disease != "0")

bmi_t2d_boxplot <- ggplot(tibbleData, aes(x = dataset, y = BMI)) +
  geom_boxplot(aes(color = dataset)) +
  theme_light()
print(bmi_t2d_boxplot)

# Save the plot object explicitly rather than relying on last_plot().
ggsave("Metadata_Figures/BMI_Filtered_boxplot.png", plot = bmi_t2d_boxplot)
## Saving 7 x 5 in image

Cholesterol

Units: (mmol/L)

“Cholesterol is a waxy, fat-like substance that’s found in all the cells in your body. Your body needs some cholesterol to make hormones, vitamin D, and substances that help you digest foods. Your body makes all the cholesterol it needs.” (taken from https://medlineplus.gov/cholesterol.html)

# Histogram of log-transformed total cholesterol, stacked by cohort.
#
# `metadata_df` is loaded once at the top of this document and never modified,
# so it is not re-read from disk here. `CHOL` is already numeric, so no
# as.numeric() conversion is needed. Rows whose log value is missing or
# non-finite are removed explicitly (the old `CHOL != "NA"` compared a numeric
# column with a string).
tibbleData <- metadata_df %>%
  as_tibble() %>%
  mutate(CHOL = log(CHOL)) %>%
  filter(is.finite(CHOL))

# bins = 30 is the value ggplot2 was already falling back to; stating it
# keeps the figure unchanged and silences the `stat_bin()` message.
chol_histogram <- ggplot(tibbleData, aes(x = CHOL, color = dataset, fill = dataset)) +
  geom_histogram(bins = 30) +
  theme_light() +
  xlab('log(CHOL) (mmol/L)')
print(chol_histogram)

# Save the plot object explicitly rather than relying on last_plot().
ggsave("Metadata_Figures/CHOL_histogram.png", plot = chol_histogram)
## Saving 7 x 5 in image

HDL

Units: mmol/L (non-US measurements)

“HDL cholesterol is often called ‘good’ cholesterol. The test for HDL cholesterol measures the amount of HDL-C in blood. High levels of cholesterol have been shown to be associated with the development of hardening of the arteries (atherosclerosis) and heart disease.” (taken from https://labtestsonline.org/tests/hdl-cholesterol)

# Histogram of log-transformed HDL cholesterol, stacked by cohort.
#
# `metadata_df` is loaded once at the top of this document and never modified,
# so it is not re-read from disk here. `HDL` is stored as character, so it is
# converted to numeric before taking the log; rows whose log value is missing
# or non-finite are then removed explicitly instead of comparing with "NA".
tibbleData <- metadata_df %>%
  as_tibble() %>%
  mutate(HDL = log(as.numeric(HDL))) %>%
  filter(is.finite(HDL))

# bins = 30 is the value ggplot2 was already falling back to; stating it
# keeps the figure unchanged and silences the `stat_bin()` message.
hdl_histogram <- ggplot(tibbleData, aes(x = HDL, color = dataset, fill = dataset)) +
  geom_histogram(bins = 30) +
  theme_light() +
  xlab('log(HDL)(mmol/L)')
print(hdl_histogram)

# Save the plot object explicitly rather than relying on last_plot().
ggsave("Metadata_Figures/HDL_histogram.png", plot = hdl_histogram)
## Saving 7 x 5 in image

LDL

Units: mmol/L (non-US measurements)

“LDL stands for low-density lipoproteins. It is sometimes called the ‘bad’ cholesterol because a high LDL level leads to a buildup of cholesterol in your arteries. HDL stands for high-density lipoproteins.” (taken from https://medlineplus.gov/ldlthebadcholesterol.html)

# Histogram of log-transformed LDL cholesterol, stacked by cohort.
#
# `metadata_df` is loaded once at the top of this document and never modified,
# so it is not re-read from disk here. `LDL` is stored as character, so it is
# converted to numeric before taking the log; rows whose log value is missing
# or non-finite are then removed explicitly instead of comparing with "NA".
tibbleData <- metadata_df %>%
  as_tibble() %>%
  mutate(LDL = log(as.numeric(LDL))) %>%
  filter(is.finite(LDL))

# bins = 30 is the value ggplot2 was already falling back to; stating it
# keeps the figure unchanged and silences the `stat_bin()` message.
ldl_histogram <- ggplot(tibbleData, aes(x = LDL, color = dataset, fill = dataset)) +
  geom_histogram(bins = 30) +
  theme_light() +
  xlab('LDL(mmol/L) (log)')
print(ldl_histogram)

# Save the plot object explicitly rather than relying on last_plot().
ggsave("Metadata_Figures/LDL_histogram.png", plot = ldl_histogram)
## Saving 7 x 5 in image

HSCRP

Units: mg/L (non-US measurements)

“The hsCRP test is a highly sensitive quantification of CRP, an acute-phase protein released into the blood by the liver during inflammation, which has been associated with the presence of heart disease… Elevated hsCRP is associated with the risk of future adverse cardiovascular events (heart attack, stroke and death) in apparently healthy individuals [1,3] and in individuals with stable coronary artery disease.” (taken from https://www.clevelandheartlab.com/wp-content/uploads/2013/09/hsCRP-Practitioner-One-Pager-CHL-D009b.pdf)

# Histogram of log-transformed hsCRP, stacked by cohort.
#
# `metadata_df` is loaded once at the top of this document and never modified,
# so it is not re-read from disk here. `HSCRP` is already numeric, so no
# as.numeric() conversion is needed.
# The summary table reports a minimum of 0.00 for HSCRP; log(0) is -Inf, so
# rows whose log value is missing or non-finite are removed explicitly rather
# than being dropped later by ggplot2 (the old `HSCRP != "NA"` also compared a
# numeric column with a string).
tibbleData <- metadata_df %>%
  as_tibble() %>%
  mutate(HSCRP = log(HSCRP)) %>%
  filter(is.finite(HSCRP))

# bins = 30 is the value ggplot2 was already falling back to; stating it
# keeps the figure unchanged and silences the `stat_bin()` message.
hscrp_histogram <- ggplot(tibbleData, aes(x = HSCRP, color = dataset, fill = dataset)) +
  geom_histogram(bins = 30) +
  theme_light() +
  xlab('HSCRP (log) (mg/L)')
print(hscrp_histogram)

# Save the plot object explicitly rather than relying on last_plot().
ggsave("Metadata_Figures/HSCRP_histogram.png", plot = hscrp_histogram)
## Saving 7 x 5 in image

Triglycerides

Units: mmol/L (non-US measurements)

"Triglycerides are a type of fat (lipid) found in your blood.

When you eat, your body converts any calories it doesn’t need to use right away into triglycerides. The triglycerides are stored in your fat cells. Later, hormones release triglycerides for energy between meals.

If you regularly eat more calories than you burn, particularly from high-carbohydrate foods, you may have high triglycerides (hypertriglyceridemia)." (taken from https://www.mayoclinic.org/diseases-conditions/high-blood-cholesterol/in-depth/triglycerides/art-20048186)

# Histogram of log-transformed triglycerides, stacked by cohort.
#
# `metadata_df` is loaded once at the top of this document and never modified,
# so it is not re-read from disk here. `TGL` is stored as character, so it is
# converted to numeric before taking the log; rows whose log value is missing
# or non-finite are then removed explicitly instead of comparing with "NA".
tibbleData <- metadata_df %>%
  as_tibble() %>%
  mutate(TGL = log(as.numeric(TGL))) %>%
  filter(is.finite(TGL))

# bins = 30 is the value ggplot2 was already falling back to; stating it
# keeps the figure unchanged and silences the `stat_bin()` message.
tgl_histogram <- ggplot(tibbleData, aes(x = TGL, color = dataset, fill = dataset)) +
  geom_histogram(bins = 30) +
  theme_light() +
  xlab('TGL (log) (mmol/L)')
print(tgl_histogram)

# Save the plot object explicitly rather than relying on last_plot().
ggsave("Metadata_Figures/TGL_histogram.png", plot = tgl_histogram)
## Saving 7 x 5 in image